Data Quality Report

In this notebook we are going to create a data quality report piece-by-piece and then put it all together at the end.


In [ ]:
# Import the libraries we need
import pandas as pd

In [ ]:
# Import the data from the source CSV file the 'Create a Pandas DataFrame From a CSV File' recipe and take a peek
accidents_data_file = '/Users/robert.dempsey/Dropbox/Private/Art of Skill Hacking/' \
                      'Books/Python Business Intelligence Cookbook/Data/Stats19-Data1979-2004/Accidents7904.csv'
accidents = pd.read_csv(accidents_data_file,
                        sep=',',
                        header=0,
                        index_col=False,
                        parse_dates=['Date'],
                        dayfirst=True,
                        tupleize_cols=False,
                        error_bad_lines=True,
                        warn_bad_lines=True,
                        skip_blank_lines=True,
                        low_memory=False
                        )
accidents.head()

Available Columns


In [ ]:
# Create a DataFrame of the columns in the accidents dataframe
columns = pd.DataFrame(list(accidents.columns.values))
columns

Data Types


In [ ]:
# Create a DataFrame of the data type of each column
data_types = pd.DataFrame(accidents.dtypes,
                          columns=['Data Type'])
data_types

Count of Missing Values in Each Column


In [ ]:
# Create a DataFrame with the count of missing values in each column
missing_data_counts = pd.DataFrame(accidents.isnull().sum(),
                                   columns=['Missing Values'])
missing_data_counts

Count of Present Values in Each Column


In [ ]:
# Create a DataFrame with the count of present values in each column
present_data_counts = pd.DataFrame(accidents.count(),
                                   columns=['Present Values'])
present_data_counts

Number of Unique Values Per-Column


In [ ]:
# Create DataFrame with the count of unique values in each column
unique_value_counts = pd.DataFrame(columns=['Unique Values'])
for v in list(accidents.columns.values):
    unique_value_counts.loc[v] = [accidents[v].nunique()]
unique_value_counts

The Minimum Value In Each Column


In [ ]:
# Create a DataFrame with the minimum value in each column
minimum_values = pd.DataFrame(columns=['Minimum Value'])
for v in list(accidents.columns.values):
    minimum_values.loc[v] = [accidents[v].min()]
minimum_values

The Maximum Value In Each Column


In [ ]:
# Create a DataFrame with the minimum value in each column
maximum_values = pd.DataFrame(columns=['Maximum Value'])
for v in list(accidents.columns.values):
    maximum_values.loc[v] = [accidents[v].max()]
maximum_values

Bring It All Together


In [ ]:
# Merge all the DataFrames together by the index
data_quality_report = data_types.join(present_data_counts).join(missing_data_counts).join(unique_value_counts)

In [ ]:
# Print out a nice report
print("\nData Quality Report")
print("Total records: {}".format(len(accidents.index)))
data_quality_report

In [ ]: